# WORDSCORES
library(dplyr)
library(quanteda)
library(fixest)
library(quanteda.textplots)
library(quanteda.textmodels)
library(stringr)

# LOAD DATA
# The code below reads `df_final`, which is presumably provided by this file.
load("path to data_final.rda")

# PARTY ----
# Wordscores with one document per party (and period).

# Regex used to keep only speeches that mention the market economy.
market_terms <- "markedsøkonomi|frie marked|markedsreform"

# Reference documents: all rows dated 1974-01-01..1992-01-01 that mention a
# market term. Party 7 is scored 1, party 13 is scored -1, every other party
# keeps NA. `market` holds the number of pattern matches in tokens_edit.
reference_docs_df <- df_final %>%
  filter(date >= as.Date("1974-01-01"), date <= as.Date("1992-01-01")) %>%
  mutate(
    reference_wordscore = case_when(
      party == 7 ~ 1,
      party == 13 ~ -1,
      TRUE ~ NA_real_
    ),
    market = str_count(tokens_edit, market_terms)
  ) %>%
  filter(market > 0)

# Working frame: every row of df_final with
#   * date coerced to Date,
#   * reference_wordscore = 1 (party 7) / -1 (party 13) when the date is not
#     missing, NA otherwise (no date range is applied at this point),
#   * speech taken from speech_edit,
# restricted to speeches that mention a market term.
df_ft_all_combined_wordscore <- df_final %>%
  mutate(
    date = as.Date(date),
    reference_wordscore = case_when(
      !is.na(date) & party == 7 ~ 1,
      !is.na(date) & party == 13 ~ -1,
      TRUE ~ NA_real_
    ),
    speech = speech_edit,
    market = str_count(tokens_edit, market_terms)
  ) %>%
  filter(market > 0)

# Rows of the working frame that carry a reference score (all dates).
# NOTE(review): not referenced again in the visible part of this script.
reference_docs <- subset(
  df_ft_all_combined_wordscore,
  !is.na(reference_wordscore)
)

# Texts to be scored: 1985-01-01..1992-01-01 only.
df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  filter(date >= as.Date("1985-01-01"), date <= as.Date("1992-01-01"))

# Strip brackets, braces, parentheses, backslashes, "*" and "?" from the
# speaker names in a single pass. These are the characters the nine separate
# replacements removed one by one; removing one never creates another, so a
# single character class gives the same result.
df_ft_all_combined_wordscore$speaker <- str_replace_all(
  df_ft_all_combined_wordscore$speaker,
  "[\\(\\)\\{\\}\\[\\]\\\\\\*\\?]",
  ""
)


# Remove the first occurrence of each row's speaker name from that row's
# speech.
#
# The name is matched literally with fixed(): names are data, not patterns, and
# characters such as ".", "+", "|", "^" or "$" would otherwise be interpreted
# as regex syntax. The operation is vectorised, so an empty data frame is a
# no-op (the former `1:nrow()` loop iterated over c(1, 0) and failed there).
# Rows with a missing/empty speaker or a missing speech are left untouched.
speaker_names <- df_ft_all_combined_wordscore$speaker
speech_texts <- df_ft_all_combined_wordscore$speech

has_speaker_name <- !is.na(speaker_names) &
  nzchar(speaker_names) &
  !is.na(speech_texts)

if (any(has_speaker_name)) {
  # str_replace() is vectorised over string and pattern and only replaces the
  # first match; speeches that do not contain the name come back unchanged.
  speech_texts[has_speaker_name] <- str_replace(
    speech_texts[has_speaker_name],
    fixed(speaker_names[has_speaker_name]),
    ""
  )
  df_ft_all_combined_wordscore$speech <- speech_texts
}

# Strip a leading "<preamble>:" from each speech: when the first ":" lies
# within the first 70 characters, drop everything up to and including it.
#
# Vectorised replacement for the row loop: missing speeches are skipped
# (the loop aborted on `if (NA)`), and an empty data frame is a no-op.
speech_texts <- df_ft_all_combined_wordscore$speech

# Position of the first ":" in each speech (NA when there is none).
first_colon <- str_locate(speech_texts, fixed(":"))[, "start"]
has_early_colon <- !is.na(first_colon) & first_colon <= 70

speech_texts[has_early_colon] <- str_sub(
  speech_texts[has_early_colon],
  start = first_colon[has_early_colon] + 1
)
df_ft_all_combined_wordscore$speech <- speech_texts


# Strip a leading "<preamble>)." from each speech: when the first ")." ends
# within the first 70 characters, drop everything up to and including it.
#
# Fix: the loop detected ")." but then cut at the first ":" anywhere in the
# speech. That removed genuine speech text up to a later colon, or turned the
# speech into NA when it contained no colon at all. The cut is now made at the
# ")." that was actually detected.
# Missing speeches are skipped and an empty data frame is a no-op.
speech_texts <- df_ft_all_combined_wordscore$speech

# End position of the first ")." in each speech (NA when there is none).
first_marker_end <- str_locate(speech_texts, fixed(")."))[, "end"]
has_early_marker <- !is.na(first_marker_end) & first_marker_end <= 70

speech_texts[has_early_marker] <- str_sub(
  speech_texts[has_early_marker],
  start = first_marker_end[has_early_marker] + 1
)
df_ft_all_combined_wordscore$speech <- speech_texts

# Flag every row of reference_docs_df so it can be told apart from the rows of
# the working frame after the two are stacked.
reference_docs_df <- mutate(reference_docs_df, duplicate_wordscore = 1)

# Stack both frames, then
#   1. drop working-frame rows that carry a reference score (flag is NA and
#      score is not NA); their counterparts from reference_docs_df stay,
#   2. rename parties 7 and 13 to "7_ref" / "13_ref",
#   3. add post_1989 (date after 1989-11-08) and append "_1" to the party
#      label of those rows.
# NOTE(review): reference_docs_df is only filtered by date and market terms,
# so rows of non-reference parties dated 1985-1992 end up in the result twice
# (once from each frame) - confirm this is intended.
# NOTE(review): rows from reference_docs_df keep whatever `speech` column
# df_final provides; speech_edit is only copied into the working frame - verify.
df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  bind_rows(reference_docs_df) %>%
  filter(!is.na(duplicate_wordscore) | is.na(reference_wordscore)) %>%
  mutate(
    party = ifelse(party %in% c(7, 13), paste0(party, "_ref"), party),
    date = as.Date(date),
    post_1989 = date > as.Date("1989-11-08"),
    party = ifelse(post_1989, paste0(party, "_1"), party)
  )

# Tokenise the speaker names while the column still exists; the tokens are
# removed from the document-feature matrix further down.
tokens_speaker <- tokens(df_ft_all_combined_wordscore$speaker)

# Keep only the columns needed for grouping and scoring.
df_ft_all_combined_wordscore <- select(
  df_ft_all_combined_wordscore,
  party, speech, reference_wordscore, duplicate_wordscore
)

# One row per party label: all speeches pasted together (a missing speech
# becomes the literal text "NA"), plus the reference score of the group's
# first row.
grouped_df_wordscore <- df_ft_all_combined_wordscore %>%
  group_by(party) %>%
  summarise(
    concatenated_speech = paste(speech, collapse = " "),
    reference_wordscore = reference_wordscore[1]
  )

# Build a corpus with one document per party label. The label is used as the
# document name and reference_wordscore travels along as a docvar.
# (The former `rownames(...) <- NULL` was dropped: the summarised tibble has
# default row names and the document names are set explicitly here.)
corp1 <- corpus(grouped_df_wordscore, text_field = "concatenated_speech")
docnames(corp1) <- grouped_df_wordscore$party

toks_final <- tokens(corp1, remove_punct = TRUE)

# Speaker-name tokens to drop from the feature set. De-duplicated because the
# same names repeat once per speech and each entry is handled as a separate
# pattern by dfm_remove().
tokens_vector <- unique(unlist(tokens_speaker))

# Document-feature matrix without speaker names and Danish stopwords.
dfmat_final <- dfm(toks_final) %>%
  dfm_remove(pattern = tokens_vector) %>%
  dfm_remove(pattern = stopwords("da"))

# Fit Wordscores on the documents with a non-missing reference score
# (smoothing = 1), then score every document, including the references.
tmod_ws <- textmodel_wordscores(
  dfmat_final,
  y = corp1$reference_wordscore,
  smooth = 1
)
summary(tmod_ws)

pred_ws <- predict(tmod_ws, se.fit = TRUE, newdata = dfmat_final)
pred_ws

textplot_scale1d(pred_ws)

# Write the result into the output folder without changing the working
# directory (setwd() would silently affect every relative path used later in
# the session).
output_dir <- "FOLDER PATH"
save(pred_ws, file = file.path(output_dir, "wordscores_parties.RData"))

# COMBINED - EVERYONE GROUPED TOGETHER ----

# Regex used to keep only speeches that mention the market economy.
market_terms <- "markedsøkonomi|frie marked|markedsreform"

# Reference documents: rows dated 1974-01-01..1992-01-01 that mention a market
# term. Party 7 is scored 1, party 13 is scored -1, every other party keeps NA.
# Fix: the chair (party 1) is excluded here as well. This frame is appended to
# the working frame further down, so chair speeches used to re-enter the
# analysis through it even though they had been removed from the working frame.
reference_docs_df <- df_final %>%
  filter(
    party != 1,
    date >= as.Date("1974-01-01"),
    date <= as.Date("1992-01-01")
  ) %>%
  mutate(
    reference_wordscore = case_when(
      party == 7 ~ 1,
      party == 13 ~ -1,
      TRUE ~ NA_real_
    ),
    market = str_count(tokens_edit, market_terms)
  ) %>%
  filter(market > 0)

# Working frame: df_final without the chair (party 1; rows with a missing
# party are dropped too), with
#   * date coerced to Date,
#   * speech taken from speech_edit (assigned once; the repeated assignments
#     were redundant),
#   * reference_wordscore = 1 (party 7) / -1 (party 13) when the date is not
#     missing, NA otherwise,
# restricted to speeches that mention a market term.
df_ft_all_combined_wordscore <- df_final %>%
  filter(party != 1) %>%
  mutate(
    date = as.Date(date),
    speech = speech_edit,
    reference_wordscore = case_when(
      !is.na(date) & party == 7 ~ 1,
      !is.na(date) & party == 13 ~ -1,
      TRUE ~ NA_real_
    ),
    market = str_count(tokens_edit, market_terms)
  ) %>%
  filter(market > 0)

# Rows of the working frame that carry a reference score (all dates).
# NOTE(review): not referenced again in the visible part of this script.
reference_docs <- subset(
  df_ft_all_combined_wordscore,
  !is.na(reference_wordscore)
)

# Texts to be scored: 1985-01-01..1992-01-01 only.
df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  filter(date >= as.Date("1985-01-01"), date <= as.Date("1992-01-01"))

# Strip brackets, braces, parentheses, backslashes, "*" and "?" from the
# speaker names in a single pass. These are the characters the nine separate
# replacements removed one by one; removing one never creates another, so a
# single character class gives the same result.
df_ft_all_combined_wordscore$speaker <- str_replace_all(
  df_ft_all_combined_wordscore$speaker,
  "[\\(\\)\\{\\}\\[\\]\\\\\\*\\?]",
  ""
)


# Remove the first occurrence of each row's speaker name from that row's
# speech.
#
# The name is matched literally with fixed(): names are data, not patterns, and
# characters such as ".", "+", "|", "^" or "$" would otherwise be interpreted
# as regex syntax. The operation is vectorised, so an empty data frame is a
# no-op (the former `1:nrow()` loop iterated over c(1, 0) and failed there).
# Rows with a missing/empty speaker or a missing speech are left untouched.
speaker_names <- df_ft_all_combined_wordscore$speaker
speech_texts <- df_ft_all_combined_wordscore$speech

has_speaker_name <- !is.na(speaker_names) &
  nzchar(speaker_names) &
  !is.na(speech_texts)

if (any(has_speaker_name)) {
  # str_replace() is vectorised over string and pattern and only replaces the
  # first match; speeches that do not contain the name come back unchanged.
  speech_texts[has_speaker_name] <- str_replace(
    speech_texts[has_speaker_name],
    fixed(speaker_names[has_speaker_name]),
    ""
  )
  df_ft_all_combined_wordscore$speech <- speech_texts
}

# Strip a leading "<preamble>:" from each speech: when the first ":" lies
# within the first 70 characters, drop everything up to and including it.
#
# Vectorised replacement for the row loop: missing speeches are skipped
# (the loop aborted on `if (NA)`), and an empty data frame is a no-op.
speech_texts <- df_ft_all_combined_wordscore$speech

# Position of the first ":" in each speech (NA when there is none).
first_colon <- str_locate(speech_texts, fixed(":"))[, "start"]
has_early_colon <- !is.na(first_colon) & first_colon <= 70

speech_texts[has_early_colon] <- str_sub(
  speech_texts[has_early_colon],
  start = first_colon[has_early_colon] + 1
)
df_ft_all_combined_wordscore$speech <- speech_texts


# Strip a leading "<preamble>)." from each speech: when the first ")." ends
# within the first 70 characters, drop everything up to and including it.
#
# Fix: the loop detected ")." but then cut at the first ":" anywhere in the
# speech. That removed genuine speech text up to a later colon, or turned the
# speech into NA when it contained no colon at all. The cut is now made at the
# ")." that was actually detected.
# Missing speeches are skipped and an empty data frame is a no-op.
speech_texts <- df_ft_all_combined_wordscore$speech

# End position of the first ")." in each speech (NA when there is none).
first_marker_end <- str_locate(speech_texts, fixed(")."))[, "end"]
has_early_marker <- !is.na(first_marker_end) & first_marker_end <= 70

speech_texts[has_early_marker] <- str_sub(
  speech_texts[has_early_marker],
  start = first_marker_end[has_early_marker] + 1
)
df_ft_all_combined_wordscore$speech <- speech_texts


# Flag every row of reference_docs_df so it can be told apart from the rows of
# the working frame after the two are stacked.
reference_docs_df <- mutate(reference_docs_df, duplicate_wordscore = 1)

# Stack both frames, then
#   1. drop working-frame rows that carry a reference score (flag is NA and
#      score is not NA); their counterparts from reference_docs_df stay,
#   2. rename parties 7 and 13 to "7_ref" / "13_ref",
#   3. add post_1989 (date after 1989-11-08) and append "_1" to the party
#      label of those rows,
#   4. assign the document group used for scoring:
#        4 = reference score  1, after 1989-11-08
#        2 = reference score  1, otherwise
#        3 = reference score -1 (both periods)
#        1 = no reference score, after 1989-11-08
#        0 = no reference score, otherwise
# NOTE(review): reference_docs_df is only filtered by date and market terms,
# so rows of non-reference parties dated 1985-1992 end up in the result twice
# (once from each frame) - confirm this is intended.
# NOTE(review): rows from reference_docs_df keep whatever `speech` column
# df_final provides; speech_edit is only copied into the working frame - verify.
df_ft_all_combined_wordscore <- df_ft_all_combined_wordscore %>%
  bind_rows(reference_docs_df) %>%
  filter(!is.na(duplicate_wordscore) | is.na(reference_wordscore)) %>%
  mutate(
    party = ifelse(party %in% c(7, 13), paste0(party, "_ref"), party),
    date = as.Date(date),
    post_1989 = date > as.Date("1989-11-08"),
    party = ifelse(post_1989, paste0(party, "_1"), party),
    # `x %in% v` is FALSE for a missing x, i.e. the same as `!is.na(x) & x == v`.
    party_grouped = ifelse(
      reference_wordscore %in% 1 & post_1989 == 1, 4,
      ifelse(
        reference_wordscore %in% 1, 2,
        ifelse(
          reference_wordscore %in% -1, 3,
          ifelse(post_1989, 1, 0)
        )
      )
    )
  )

# Tokenise the speaker names while the column still exists; the tokens are
# removed from the document-feature matrix further down.
tokens_speaker <- tokens(df_ft_all_combined_wordscore$speaker)

# Keep only the columns needed for grouping and scoring.
df_ft_all_combined_wordscore <- select(
  df_ft_all_combined_wordscore,
  party_grouped, party, speech, reference_wordscore, duplicate_wordscore
)

# One row per document group: all speeches pasted together (a missing speech
# becomes the literal text "NA"), plus the reference score of the group's
# first row.
grouped_df_wordscore <- df_ft_all_combined_wordscore %>%
  group_by(party_grouped) %>%
  summarise(
    concatenated_speech = paste(speech, collapse = " "),
    reference_wordscore = reference_wordscore[1]
  )

# Build a corpus with one document per group. The group code is used as the
# document name and reference_wordscore travels along as a docvar.
# (The former `rownames(...) <- NULL` was dropped: the summarised tibble has
# default row names and the document names are set explicitly here.)
corp1 <- corpus(grouped_df_wordscore, text_field = "concatenated_speech")
docnames(corp1) <- grouped_df_wordscore$party_grouped

toks_final <- tokens(corp1, remove_punct = TRUE)

# Speaker-name tokens to drop from the feature set. De-duplicated because the
# same names repeat once per speech and each entry is handled as a separate
# pattern by dfm_remove().
tokens_vector <- unique(unlist(tokens_speaker))

# Document-feature matrix without speaker names and Danish stopwords.
dfmat_final <- dfm(toks_final) %>%
  dfm_remove(pattern = tokens_vector) %>%
  dfm_remove(pattern = stopwords("da"))

# Fit Wordscores on the documents with a non-missing reference score
# (smoothing = 1), then score every document, including the references.
tmod_ws <- textmodel_wordscores(
  dfmat_final,
  y = corp1$reference_wordscore,
  smooth = 1
)
summary(tmod_ws)

pred_ws <- predict(tmod_ws, se.fit = TRUE, newdata = dfmat_final)
pred_ws

textplot_scale1d(pred_ws)

# Write the result into the output folder without changing the working
# directory (setwd() would silently affect every relative path used later in
# the session).
output_dir <- "FOLDER PATH"
save(
  pred_ws,
  file = file.path(output_dir, "wordscores_combined_together.RData")
)
